{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 作业内容：\n",
    "\n",
    "> 1，设计并训练深度神经网络，对夸克胶子喷注进行分类"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 参考答案\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import layers\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#load data\n",
    "with np.load('./data/QG_jets.npz') as f:\n",
    "    raw_data = f['X']\n",
    "    label = f['y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data tensor has such shape: (100000, 139, 4)\n",
      "label tensor has such shape: (100000,)\n",
      "head 5 lines for event No.1: \n",
      "[[ 2.68769142e-01  3.56903171e-01  4.74138734e+00  2.20000000e+01]\n",
      " [ 1.60076377e-01 -2.55609533e-01  4.55022910e+00  2.20000000e+01]\n",
      " [ 1.14868731e+00 -6.24380156e-02  4.50385377e+00 -2.11000000e+02]\n",
      " [ 4.13159146e+00  1.73686350e-01  4.76622410e+00 -3.21000000e+02]\n",
      " [ 1.69599701e+00 -2.12177764e-01  4.79687162e+00 -2.11000000e+02]]\n"
     ]
    }
   ],
   "source": [
    "#check and know the data\n",
    "print('data tensor has such shape: {}'.format(raw_data.shape))\n",
    "print('label tensor has such shape: {}'.format(label.shape))\n",
    "print('head 5 lines for event No.1: \\n{}'.format(raw_data[0, :5, :]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#as we already know that a row for an event has such variables: [pt, y, phi, pid]\n",
    "#and our network wants a histogram of y and phi\n",
    "\n",
    "#define the transformation function\n",
    "def transform(data, bins=[28, 28], range_=[[-1.7, 1.7], [0, 2*np.pi]]):\n",
    "    r'''\n",
    "    Convert the raw data of 139 rows and 4 columns into a 28-by-28 2D heatmap.\n",
    "    '''\n",
    "    y = data[:, 1]\n",
    "    phi = data[:, 2]\n",
    "    res, _, _ = np.histogram2d(y, phi, bins=bins, range=range_, density=True)\n",
    "    return res\n",
    "\n",
    "#convert the data set\n",
    "data = np.array([transform(item) for item in raw_data])\n",
    "#do a simple scale\n",
    "data /= data.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pre-processed data tensor has such shape: (100000, 28, 28)\n"
     ]
    }
   ],
   "source": [
    "#check and save our pre-processed data\n",
    "print('Pre-processed data tensor has such shape: {}'.format(data.shape))\n",
    "if not os.path.exists('data'):\n",
    "    os.mkdir('data')\n",
    "np.save('./data/QG_proc_data.npy', data)\n",
    "np.save('./data/QG_proc_label.npy', label)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train set: \n",
      "data: (90000, 28, 28, 1), label: (90000, 2)\n",
      "test set: \n",
      "data: (10000, 28, 28, 1), label: (10000, 2)\n"
     ]
    }
   ],
   "source": [
    "#I'll skip the former step for saving time, so load the pre-processed data directly\n",
    "data = np.load('./data/QG_proc_data.npy')\n",
    "label = np.load('./data/QG_proc_label.npy')\n",
    "\n",
    "#prepare the training and test set\n",
    "data = np.expand_dims(data, -1)#keras need one more dimension (for channel)\n",
    "label = keras.utils.to_categorical(label, 2)#one-hot trick\n",
    "data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=0.1)\n",
    "\n",
    "print('train set: \\ndata: {}, label: {}'.format(data_train.shape, label_train.shape))\n",
    "print('test set: \\ndata: {}, label: {}'.format(data_test.shape, label_test.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: Logging before flag parsing goes to stderr.\n",
      "W0506 14:22:01.650882  5196 deprecation.py:506] From C:\\Users\\YmeHweng\\Anaconda3\\lib\\site-packages\\tensorflow\\python\\ops\\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Call initializer instance with the dtype argument instead of passing it to the constructor\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"sequential\"\n",
      "_________________________________________________________________\n",
      "Layer (type)                 Output Shape              Param #   \n",
      "=================================================================\n",
      "conv2d (Conv2D)              (None, 26, 26, 32)        320       \n",
      "_________________________________________________________________\n",
      "max_pooling2d (MaxPooling2D) (None, 13, 13, 32)        0         \n",
      "_________________________________________________________________\n",
      "conv2d_1 (Conv2D)            (None, 11, 11, 64)        18496     \n",
      "_________________________________________________________________\n",
      "max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         \n",
      "_________________________________________________________________\n",
      "flatten (Flatten)            (None, 1600)              0         \n",
      "_________________________________________________________________\n",
      "dropout (Dropout)            (None, 1600)              0         \n",
      "_________________________________________________________________\n",
      "dense (Dense)                (None, 2)                 3202      \n",
      "=================================================================\n",
      "Total params: 22,018\n",
      "Trainable params: 22,018\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "#construct the model\n",
    "num_classes = 2\n",
    "input_shape = (28, 28, 1)\n",
    "model = keras.Sequential(\n",
    "    [\n",
    "        keras.Input(shape=input_shape),\n",
    "        layers.Conv2D(32, kernel_size=(3, 3), activation='relu'),\n",
    "        layers.MaxPooling2D(pool_size=(2, 2)),\n",
    "        layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),\n",
    "        layers.MaxPooling2D(pool_size=(2)),\n",
    "        layers.Flatten(),\n",
    "        layers.Dropout(0.5),\n",
    "        layers.Dense(num_classes, activation='softmax'),\n",
    "    ]\n",
    ")\n",
    "\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 81000 samples, validate on 9000 samples\n",
      "Epoch 1/8\n",
      "81000/81000 - 21s - loss: 0.5678 - acc: 0.7097 - val_loss: 0.5063 - val_acc: 0.7554\n",
      "Epoch 2/8\n",
      "81000/81000 - 21s - loss: 0.5143 - acc: 0.7546 - val_loss: 0.4974 - val_acc: 0.7582\n",
      "Epoch 3/8\n",
      "81000/81000 - 21s - loss: 0.5084 - acc: 0.7583 - val_loss: 0.4940 - val_acc: 0.7634\n",
      "Epoch 4/8\n",
      "81000/81000 - 21s - loss: 0.5059 - acc: 0.7595 - val_loss: 0.4967 - val_acc: 0.7646\n",
      "Epoch 5/8\n",
      "81000/81000 - 21s - loss: 0.5067 - acc: 0.7595 - val_loss: 0.4928 - val_acc: 0.7644\n",
      "Epoch 6/8\n",
      "81000/81000 - 20s - loss: 0.5058 - acc: 0.7607 - val_loss: 0.4953 - val_acc: 0.7639\n",
      "Epoch 7/8\n",
      "81000/81000 - 21s - loss: 0.5042 - acc: 0.7610 - val_loss: 0.4921 - val_acc: 0.7644\n",
      "Epoch 8/8\n",
      "81000/81000 - 21s - loss: 0.5053 - acc: 0.7598 - val_loss: 0.4942 - val_acc: 0.7634\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x1bd99d5df98>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#compile and train the model\n",
    "batch_size = 128\n",
    "epochs = 8\n",
    "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "\n",
    "model.fit(data_train, label_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, verbose=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#save the weights\n",
    "if not os.path.exists('FNNmodel'):\n",
    "    os.mkdir('FNNmodel')\n",
    "model.save_weights('./FNNmodel/simple.h5')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy on test set: 0.7676\n"
     ]
    }
   ],
   "source": [
    "#load the pre-trained weights and test\n",
    "model.load_weights('./FNNmodel/simple.h5')\n",
    "#check the accuracy\n",
    "test_pred = model.predict(data_test)\n",
    "test_pred = np.where(test_pred>0.5, 1, 0)\n",
    "acc = ((test_pred == label_test).sum()/2) / label_test.shape[0]\n",
    "\n",
    "print('Accuracy on test set: {}'.format(acc))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
